# Necessary packages are imported
require(data.table)
## Loading required package: data.table
require(lubridate)
## Loading required package: lubridate
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
require(forecast)
## Loading required package: forecast
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
require(skimr)
## Loading required package: skimr
require(repr)
## Loading required package: repr
require(ggplot2)
## Loading required package: ggplot2
require(readxl)
## Loading required package: readxl
require(GGally)
## Loading required package: GGally
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
require(corrplot)
## Loading required package: corrplot
## corrplot 0.92 loaded

Current Account

Motivation

Current account basically shows the net amount of monetary operations made by a country. It is one of the most important macroeconomic indicators showing the well-being of an economy. As it affects the other economic policies of the country, it is affected by many indicators as well. The independent variables which are going to be used during time series analysis is listed below:

  • Brent Petrol Price Index: Trade balance and energy production/consumption is main parameter determining the current account level. Brent petrol prices can be a significant measure considering it leads to changes in both trade balance and cost of energy consumption.

  • Unemployment: Countries which cannot produce their own consumption have to import goods and services from other countries, which is directly related to the current account. Unemployment is going to be a measurement to integrate the effect of both the production of goods and services and the human capacity. (Labor force could also be used.)

  • Exchange Rate: In countries with high exchange rate is unlikely to import goods and services and also likely to increase the export the goods and services due to low cost.

  • Central Bank Reserve: Current account shows if a country is a buyer or a seller. Considering this, if a country is buyer this will consume its reserves and it is a seller its reserves should increase.

Importing Data

# Read the raw EVDS macroeconomic series for the current-account analysis and
# convert to data.table for by-reference column operations later on.
# NOTE(review): hard-coded local path — assumes the EVDS export is in ~/Downloads.
c_account_all_data <- read_excel("~/Downloads/EVDS-10.xlsx")
c_account_all_data <- data.table::as.data.table(c_account_all_data)

# Assign short, analysis-friendly column names (order must match the sheet's columns).
colnames(c_account_all_data) <- c("Date", "brent_idx", "c_account", "reserve", "unemployement", "ex_rate" )
c_account_all_data$Date <- as.Date(c_account_all_data$Date)

# Quick sanity check of the first rows.
head(c_account_all_data)
##          Date brent_idx c_account reserve unemployement  ex_rate
##        <Date>     <num>     <num>   <num>         <num>    <num>
## 1: 2014-01-01    108.16     -4519  124270          10.5 2.216836
## 2: 2014-02-01    108.98     -2703  127691          10.5 2.212760
## 3: 2014-03-01    105.95     -3264  126051           9.4 2.217795
## 4: 2014-04-01    108.63     -4201  129732           8.8 2.127471
## 5: 2014-05-01    109.21     -2709  130591           8.4 2.090805
## 6: 2014-06-01    111.03     -3599  133534           9.0 2.115724
# Inspect column types and row count (120 monthly observations per the output below).
str(c_account_all_data)
## Classes 'data.table' and 'data.frame':   120 obs. of  6 variables:
##  $ Date         : Date, format: "2014-01-01" "2014-02-01" ...
##  $ brent_idx    : num  108 109 106 109 109 ...
##  $ c_account    : num  -4519 -2703 -3264 -4201 -2709 ...
##  $ reserve      : num  124270 127691 126051 129732 130591 ...
##  $ unemployement: num  10.5 10.5 9.4 8.8 8.4 9 9.7 10.1 10.1 10.9 ...
##  $ ex_rate      : num  2.22 2.21 2.22 2.13 2.09 ...
##  - attr(*, ".internal.selfref")=<externalptr>
# Google Trends search counts for the "cari açık" keyword; merged onto the
# macro series by Date. all = TRUE keeps months present in only one table
# (an outer join), so unmatched months would appear as NA rows.
c_account_search_data <- read_excel("~/Downloads/multiTimeline-8.xlsx")
c_account_search_data <- data.table::as.data.table(c_account_search_data)

colnames(c_account_search_data) <- c("Date", "c_account_count")
c_account_search_data$Date <- as.Date(c_account_search_data$Date)
c_account_merged_data<- merge(c_account_all_data, c_account_search_data, by = "Date", all = TRUE)

Plot the Target Variable

# CHATGPT PROMPT: I want to show both peak and bottom values and labels should be dates with months and years

# Turning points of the series: in diff(sign(diff(x))) a step of -2 marks a
# local maximum and +2 a local minimum (index shifted by 1 to land on the extremum).
turning_c <- diff(sign(diff(c_account_merged_data$c_account)))
peak_indices_c <- which(turning_c == -2) + 1
bottom_indices_c <- which(turning_c == +2) + 1

# Swing size of each extremum versus the value two months earlier.
# FIX: clamp the lookback index to 1 (pmax(..., 1)) so early extrema cannot
# produce an invalid (<= 0) subscript. The previous clamp of 3 could compare
# an early peak against a *later* observation, and is inconsistent with the
# house-sales section, which clamps to 1.
peak_changes_c <- abs(c_account_merged_data$c_account[peak_indices_c] - c_account_merged_data$c_account[pmax(peak_indices_c - 2, 1)])
bottom_changes_c <- abs(c_account_merged_data$c_account[bottom_indices_c] - c_account_merged_data$c_account[pmax(bottom_indices_c - 2, 1)])

# Keep only extrema whose swing exceeds the 1500 threshold.
significant_peak_indices_c <- peak_indices_c[peak_changes_c > 1500]
significant_bottom_indices_c <- bottom_indices_c[bottom_changes_c > 1500]

combined_indices_c <- union(significant_peak_indices_c, significant_bottom_indices_c)

extreme_data_c <- c_account_merged_data[combined_indices_c, ]

# Line plot with the significant extrema highlighted in red and date-labelled.
ggplot(c_account_merged_data ,aes(x=Date,y=c_account)) + geom_line() +
geom_point(data = extreme_data_c, aes(x = Date, y = c_account), color = "red", size = 1) +
geom_text(data = extreme_data_c, aes(x = Date, y = c_account, label = format(Date, "%b %Y")), vjust = -1, color = "red", size=3) +
scale_x_date(date_labels = "%b %Y", date_breaks = "6 months") + theme(axis.text.x = element_text(angle = 45, hjust = 1))

We can see that there are peaks in a positive direction during summer, especially around August. This is because Turkey has tourism income during summer holidays. And in winter, since heat consumption increases, the current account decreases. Therefore we can suggest that there is yearly seasonality in the data and it should be considered during modeling.

Plot the Target Variable vs Search Keyword

# Compare the standardized current account with the standardized search count
# so both series share one scale. The y = c_account mapping set in ggplot()
# is overridden by the scale()d aesthetics in each geom_line().
ggplot(c_account_merged_data ,aes(x=Date,y=c_account)) + 
geom_line(aes(y = scale(c_account), color = "Current Account")) +
geom_line(aes(y = scale(c_account_count), color = "Search Count of Cari Açık")) +
labs(title = "Current Account vs. Search Count of Cari Açık", x = "Date", y = "Count/Current Account") +
scale_x_date(date_labels = "%b %Y", date_breaks = "6 months") +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
      # FIX: a numeric legend.position was deprecated in ggplot2 3.5.0 (see the
      # warning this chunk previously emitted); place the legend inside the
      # panel via legend.position = "inside" + legend.position.inside.
      legend.position = "inside",
      legend.position.inside = c(0.2, 0.8))

“Cari Açık” is Turkish word for current account and it could be one of the best options to follow the trend in current account of Turkey. This plot shows that although the keyword and current account amount of Turkey is not correlated, during both peaks (after Jan 2018) and bottoms (after April 2021) there is interest in the keyword.

Correlations

# Pairwise scatter plots and correlations for all columns (GGally::ggpairs).
ggpairs(c_account_merged_data)

Since our target variable is current account, we can comment on the correlation and relation of this variable with the others. Although there is no information at first sight, we can say that reserve amount has negative correlation with the current account.

Time Series Analysis

Since there is no obvious trend in target variable, we can start decomposing with examining seasonality and trend together.

# Autocorrelation of the raw target up to lag 36 (three years) to look for
# trend/seasonality before modeling.
acf(c_account_merged_data$c_account,36)

We can implement yearly seasonality and examine the residual behavior before implementing independent variables.

# Deterministic regressors: a linear trend index (row number) and a
# month-of-year label (stored as character) used as seasonality dummies.
c_account_merged_data[,trnd:=1:.N]
c_account_merged_data[,mon:=as.character(month(Date,label=T))]
head(c_account_merged_data)
## Key: <Date>
##          Date brent_idx c_account reserve unemployement  ex_rate
##        <Date>     <num>     <num>   <num>         <num>    <num>
## 1: 2014-01-01    108.16     -4519  124270          10.5 2.216836
## 2: 2014-02-01    108.98     -2703  127691          10.5 2.212760
## 3: 2014-03-01    105.95     -3264  126051           9.4 2.217795
## 4: 2014-04-01    108.63     -4201  129732           8.8 2.127471
## 5: 2014-05-01    109.21     -2709  130591           8.4 2.090805
## 6: 2014-06-01    111.03     -3599  133534           9.0 2.115724
##    c_account_count  trnd    mon
##              <num> <int> <char>
## 1:              48     1    Jan
## 2:              62     2    Feb
## 3:              44     3    Mar
## 4:              43     4    Apr
## 5:              31     5    May
## 6:              29     6    Jun
# Work on a copy so prediction/residual columns do not pollute the source table.
tmp_c=copy(c_account_merged_data)
tmp_c[,actual:=c_account]
# Baseline model: linear trend + monthly seasonality dummies.
lm_base_c=lm(c_account~trnd+mon,c_account_merged_data)
summary(lm_base_c)
## 
## Call:
## lm(formula = c_account ~ trnd + mon, data = c_account_merged_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -6482  -1438    -22   1483   5169 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3072.584    813.570  -3.777 0.000262 ***
## trnd           -3.033      6.129  -0.495 0.621711    
## monAug       3355.232   1035.209   3.241 0.001587 ** 
## monDec       -764.436   1036.079  -0.738 0.462241    
## monFeb        156.434   1034.991   0.151 0.880146    
## monJan       -537.499   1035.082  -0.519 0.604637    
## monJul       1159.499   1035.082   1.120 0.265136    
## monJun       1056.066   1034.991   1.020 0.309858    
## monMar       -445.933   1034.936  -0.431 0.667423    
## monMay       -560.567   1034.936  -0.542 0.589190    
## monNov       1149.631   1035.807   1.110 0.269535    
## monOct       3840.098   1035.571   3.708 0.000333 ***
## monSep       3224.365   1035.372   3.114 0.002367 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2314 on 107 degrees of freedom
## Multiple R-squared:  0.3455, Adjusted R-squared:  0.2721 
## F-statistic: 4.708 on 12 and 107 DF,  p-value: 4e-06
# Residual diagnostics: plots plus a Breusch-Godfrey test for serial correlation.
checkresiduals(lm_base_c)

## 
##  Breusch-Godfrey test for serial correlation of order up to 16
## 
## data:  Residuals
## LM test = 67.678, df = 16, p-value = 2.536e-08
# Store fitted values and residuals, then overlay actual vs predicted.
tmp_c[,predicted_trend_mon:=predict(lm_base_c,tmp_c)]
tmp_c[,residual_trend_mon:=actual-predicted_trend_mon]
ggplot(tmp_c ,aes(x=Date)) +
        geom_line(aes(y=actual,color='real')) + 
        geom_line(aes(y=predicted_trend_mon,color='predicted'))

# Residual autocorrelation up to lag 60 (five years).
acf(tmp_c$residual_trend_mon,60)

Although we are already close to white noise, there is still improvement that should be implemented. There is cyclic behaviour in the residuals’ autocorrelation, but it repeats every 14-15 months. Since I could not justify that, I will not implement it. Let’s plot the residuals against the independent variables.

# Scatter the baseline-model residuals against each candidate regressor to
# see which variables still carry explanatory signal.
panel_vars_c <- c("brent_idx", "reserve", "unemployement", "ex_rate",
                  "c_account_count")
panel_plots_c <- lapply(panel_vars_c, function(v) {
  ggplot(tmp_c, aes(x = .data[[v]], y = residual_trend_mon)) +
    geom_point() +
    labs(x = v)
})
gridExtra::grid.arrange(grobs = panel_plots_c, nrow = 2)

We can implement the unemployment and reserve variables in the model since they show some correlation with the residuals. But in order not to risk multicollinearity, let’s first implement only the reserve variable.

# Extend the baseline model with the central-bank reserve level.
lm_base_c2=lm(c_account~trnd+mon+reserve,c_account_merged_data)
summary(lm_base_c2)
## 
## Call:
## lm(formula = c_account ~ trnd + mon + reserve, data = c_account_merged_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5369.2 -1413.6   -77.9  1393.8  4683.5 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2975.19584 1979.92056   1.503 0.135894    
## trnd         -10.50578    6.27739  -1.674 0.097160 .  
## monAug      3584.16877  992.23536   3.612 0.000466 ***
## monDec      -428.68470  995.81434  -0.430 0.667716    
## monFeb       402.83107  992.40671   0.406 0.685625    
## monJan      -320.05608  991.87984  -0.323 0.747576    
## monJul      1273.63471  990.31361   1.286 0.201213    
## monJun      1056.45849  989.63057   1.068 0.288159    
## monMar      -407.81418  989.64505  -0.412 0.681112    
## monMay      -629.78719  989.79790  -0.636 0.525966    
## monNov      1570.68994  998.49006   1.573 0.118684    
## monOct      4166.77418  995.05764   4.187 5.85e-05 ***
## monSep      3393.75395  991.30740   3.424 0.000880 ***
## reserve       -0.05237    0.01577  -3.322 0.001228 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2213 on 106 degrees of freedom
## Multiple R-squared:  0.4072, Adjusted R-squared:  0.3345 
## F-statistic: 5.602 on 13 and 106 DF,  p-value: 1.122e-07
# Residual diagnostics for the reserve-augmented model.
checkresiduals(lm_base_c2)

## 
##  Breusch-Godfrey test for serial correlation of order up to 17
## 
## data:  Residuals
## LM test = 62.613, df = 17, p-value = 3.869e-07
# Fitted values and residuals for the second model; overlay actual vs predicted.
tmp_c[,predicted_trend_mon_res:=predict(lm_base_c2,tmp_c)]
tmp_c[,residual_trend_mon_res:=actual-predicted_trend_mon_res]
ggplot(tmp_c ,aes(x=Date)) +
        geom_line(aes(y=actual,color='real')) + 
        geom_line(aes(y=predicted_trend_mon_res,color='predicted'))

# Residual autocorrelation up to lag 60.
acf(tmp_c$residual_trend_mon_res,60)

Unfortunately, we still see serial correlation in the residuals. This can be handled with differencing, but to implement differencing, we should remove the seasonality variables from the data.

# First (lag-1) differencing to remove serial correlation.
c_account_merged_data[,lag_1_c_account:=shift(c_account,1)]
c_account_merged_data[,lag_1_diff:=c_account-lag_1_c_account]
tmp_c[,lag_1_c_account:=shift(c_account,1)]
tmp_c[,actual_lag_1_diff:=c_account-lag_1_c_account]
# Drop the first row (NA from the lag). NOTE(review): tmp_c keeps its NA row,
# which is why a later plot reports "Removed 1 row".
c_account_merged_data = c_account_merged_data[complete.cases(c_account_merged_data)]
acf(c_account_merged_data$lag_1_diff)

acf(c_account_merged_data$lag_1_diff,lag=60)

We can observe that differencing prevents the serial correlation but yearly seasonality should still be implemented.

# Model the differenced series with monthly dummies only (the trend term is
# removed by differencing).
lm_base_c3=lm(lag_1_diff~mon,c_account_merged_data)
summary(lm_base_c3)
## 
## Call:
## lm(formula = lag_1_diff ~ mon, data = c_account_merged_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6531.4  -833.8  -101.9  1093.8  6950.4 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    442.9      632.9   0.700  0.48559    
## monAug        1749.8      895.1   1.955  0.05321 .  
## monDec       -2360.0      895.1  -2.637  0.00962 ** 
## monFeb         248.0      895.1   0.277  0.78226    
## monJan         112.2      919.6   0.122  0.90311    
## monJul        -342.5      895.1  -0.383  0.70274    
## monJun        1170.7      895.1   1.308  0.19371    
## monMar       -1048.3      895.1  -1.171  0.24413    
## monMay       -1006.5      895.1  -1.124  0.26333    
## monNov       -3136.4      895.1  -3.504  0.00067 ***
## monOct         169.8      895.1   0.190  0.84990    
## monSep        -576.8      895.1  -0.644  0.52069    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2001 on 107 degrees of freedom
## Multiple R-squared:  0.3222, Adjusted R-squared:  0.2525 
## F-statistic: 4.624 on 11 and 107 DF,  p-value: 9.674e-06
# Residual diagnostics for the differenced model.
checkresiduals(lm_base_c3)

## 
##  Breusch-Godfrey test for serial correlation of order up to 15
## 
## data:  Residuals
## LM test = 44.503, df = 15, p-value = 9.171e-05
tmp_c[,predicted_diff_mon:=predict(lm_base_c3,tmp_c)]
tmp_c[,residual_diff_mon:=actual_lag_1_diff-predicted_diff_mon]
# Back-transform for plotting: predicted level = predicted difference +
# previous month's level. The first row has an NA lag, hence the warning below.
ggplot(tmp_c ,aes(x=Date)) +
        geom_line(aes(y=actual,color='real')) + 
        geom_line(aes(y=predicted_diff_mon+lag_1_c_account,color='predicted'))
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).

Conclusion

Throughout the analysis, firstly a regression analysis was held but the serial correlation did not vanish. Later, no relation between the independent variables and the residuals could be found. Therefore differencing was applied to get rid of serial correlation. After the lag-1 difference, there was still a yearly seasonal relation, so yearly seasonality was also implemented in the model. Results show that we reached white noise: the residuals are not correlated and almost entirely within the specified range (blue dotted lines). Predicted and actual values are also plotted. And since the previous current account could be a good indicator for predicting the next month, implementing lag-1 differencing makes sense. (I am not sure whether implementing yearly seasonality in the differencing model via dummy variables, instead of somehow integrating a lag-12 difference, is preferable, but it decreased the correlation in the end.)

Housing Sales

Motivation

Housing sales are one of the most significant indicators of the economic growth of a country. Although people usually tend to buy their own houses, there are many other factors in buying or selling a house such as interest rates, other investment options, inflation rates etc. The independent variables which are chosen to be used in the time series analysis are listed below:

  • BIST 100 Index: Borsa Istanbul is an important option to when it comes to investment. It indicates the health of the companies operating in Turkey and when it is tend to increase, buying houses becomes less attractive to invest in.

  • Interest Rate: Investing the money in the bank for a term and receiving interest and payments with it is another option for investment. Again, when interest rates increases, housing sales drops.

  • Consumer Price Index: Inflation rate and CPI are the measures the value of the money in that country. Since housing market is investment option besides owning a house, it can be used as a protection from inflation.

  • Unit Prices: Prices of houses and overall house market prices also determines the amount of sales in a way. Demands and supplies for houses is also critical for this.

Importing Data

# Read the EVDS housing-market series and convert to data.table.
# NOTE(review): hard-coded local path, same convention as the current-account data.
house_sales_all_data <- read_excel("~/Downloads/EVDS-11.xlsx")
house_sales_all_data <- data.table::as.data.table(house_sales_all_data)

# Short column names (order must match the sheet's columns).
colnames(house_sales_all_data) <- c("Date", "house_sales", "bist_idx", "interest_rate", "c_price_idx", "unit_price" )
house_sales_all_data$Date <- as.Date(house_sales_all_data$Date)

# Quick sanity check of the first rows.
head(house_sales_all_data)
##          Date house_sales bist_idx interest_rate c_price_idx unit_price
##        <Date>       <num>    <num>         <num>       <num>      <num>
## 1: 2014-01-01       87639   618.10        7.3140      233.54     1510.3
## 2: 2014-02-01       82597   626.04        9.1850      234.54     1524.5
## 3: 2014-03-01       87617   694.98        9.7125      237.18     1541.3
## 4: 2014-04-01       83610   734.35        9.6100      240.37     1575.2
## 5: 2014-05-01       90377   794.46        9.2820      241.32     1597.8
## 6: 2014-06-01       92936   790.40        8.9700      242.07     1628.4
# Inspect column types and row count (120 monthly observations per the output below).
str(house_sales_all_data)
## Classes 'data.table' and 'data.frame':   120 obs. of  6 variables:
##  $ Date         : Date, format: "2014-01-01" "2014-02-01" ...
##  $ house_sales  : num  87639 82597 87617 83610 90377 ...
##  $ bist_idx     : num  618 626 695 734 794 ...
##  $ interest_rate: num  7.31 9.19 9.71 9.61 9.28 ...
##  $ c_price_idx  : num  234 235 237 240 241 ...
##  $ unit_price   : num  1510 1524 1541 1575 1598 ...
##  - attr(*, ".internal.selfref")=<externalptr>
# Google Trends search counts for the "tapu" keyword; outer-joined (all = TRUE)
# onto the housing series by Date.
house_sales_search_data <- read_excel("~/Downloads/multiTimeline-13.xlsx")
house_sales_search_data <- data.table::as.data.table(house_sales_search_data)

colnames(house_sales_search_data) <- c("Date", "tapu_search_count")
house_sales_search_data$Date <- as.Date(house_sales_search_data$Date)
house_sales_merged_data<- merge(house_sales_all_data, house_sales_search_data, by = "Date", all = TRUE)

Plot the Target Variable

# Locate turning points in house sales: in diff(sign(diff(x))), a step of -2
# marks a local maximum and +2 a local minimum (shift by 1 to hit the extremum).
turning_h <- diff(sign(diff(house_sales_merged_data$house_sales)))
peak_indices_h <- which(turning_h == -2) + 1
bottom_indices_h <- which(turning_h == +2) + 1

# Swing size of each extremum versus the preceding month (index clamped to 1).
sales_h <- house_sales_merged_data$house_sales
peak_changes_h <- abs(sales_h[peak_indices_h] - sales_h[pmax(peak_indices_h - 1, 1)])
bottom_changes_h <- abs(sales_h[bottom_indices_h] - sales_h[pmax(bottom_indices_h - 1, 1)])

# Keep only extrema with a month-over-month swing above the 14000 threshold.
significant_peak_indices_h <- peak_indices_h[peak_changes_h > 14000]
significant_bottom_indices_h <- bottom_indices_h[bottom_changes_h > 14000]

combined_indices_h <- union(significant_peak_indices_h, significant_bottom_indices_h)

extreme_data_h <- house_sales_merged_data[combined_indices_h, ]

# Plot the series with significant extrema highlighted in blue and date-labelled.
ggplot(house_sales_merged_data, aes(x = Date, y = house_sales)) +
  geom_line() +
  geom_point(data = extreme_data_h, aes(x = Date, y = house_sales),
             color = "blue", size = 1) +
  geom_text(data = extreme_data_h,
            aes(x = Date, y = house_sales, label = format(Date, "%b %Y")),
            vjust = -1, color = "blue", size = 2.5) +
  scale_x_date(date_labels = "%b %Y", date_breaks = "6 months") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

In this plot, obvious peaks in December time and significant drops after that can be seen. Significant fluctuations between July 2019 and Jan 2023 may be result of interest rates or other saving and investment options. (Especially due to covid conditions)

Plot the Target Variable vs Search Keyword

# Compare the standardized house sales with the standardized search count so
# both series share one scale. The y = house_sales mapping set in ggplot() is
# overridden by the scale()d aesthetics in each geom_line().
ggplot(house_sales_merged_data ,aes(x=Date,y=house_sales)) + 
geom_line(aes(y = scale(house_sales), color = "House Sales")) +
geom_line(aes(y = scale(tapu_search_count), color = "Search Count of Tapu")) +
labs(title = "House Sales vs. Search Count of Tapu", x = "Date", y = "Count/House Sales") +
scale_x_date(date_labels = "%b %Y", date_breaks = "6 months") +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
      # FIX: a numeric legend.position was deprecated in ggplot2 3.5.0; use
      # legend.position = "inside" + legend.position.inside instead.
      legend.position = "inside",
      legend.position.inside = c(0.2, 0.8))

“Tapu” is a Turkish word representing the processes and documents during house buying or selling. It is a strong word to see the trends in house sales as we can see in the plot. There is an upward trend in search count since some of the processes is held online in time but fluctuations in both housing sales and search count is correlated.

Correlations

# Pairwise scatter plots and correlations for all columns (GGally::ggpairs).
ggpairs(house_sales_merged_data)

In this plot, when we observe the relation of the other variables with housing sales (the target variable), we can see that interest rates and the BIST index have a somewhat negative correlation, and both the consumer price index and house unit prices also show negative correlation. It would be risky to use both variables in one model since they are strongly correlated between themselves, and that would affect the coefficients and interpretability of the model.

Time Series Analysis

Like in current account data, since there is no obvious trend in target variable, we can start decomposing with examining seasonality and trend together.

# Autocorrelation of the raw target up to lag 36 (three years) to look for
# trend/seasonality before modeling.
acf(house_sales_merged_data$house_sales,36)

We can implement yearly seasonality and examine the residual behavior before implementing independent variables.

# Deterministic regressors: linear trend index (row number) and month-of-year
# label (as character) for yearly seasonality dummies.
house_sales_merged_data[,trnd:=1:.N]
house_sales_merged_data[,mon:=as.character(month(Date,label=T))]
head(house_sales_merged_data)
## Key: <Date>
##          Date house_sales bist_idx interest_rate c_price_idx unit_price
##        <Date>       <num>    <num>         <num>       <num>      <num>
## 1: 2014-01-01       87639   618.10        7.3140      233.54     1510.3
## 2: 2014-02-01       82597   626.04        9.1850      234.54     1524.5
## 3: 2014-03-01       87617   694.98        9.7125      237.18     1541.3
## 4: 2014-04-01       83610   734.35        9.6100      240.37     1575.2
## 5: 2014-05-01       90377   794.46        9.2820      241.32     1597.8
## 6: 2014-06-01       92936   790.40        8.9700      242.07     1628.4
##    tapu_search_count  trnd    mon
##                <num> <int> <char>
## 1:                14     1    Jan
## 2:                15     2    Feb
## 3:                14     3    Mar
## 4:                16     4    Apr
## 5:                19     5    May
## 6:                20     6    Jun
# Work on a copy so prediction/residual columns do not pollute the source table.
tmp_h=copy(house_sales_merged_data)
tmp_h[,actual:=house_sales]
# Baseline model: linear trend + monthly seasonality dummies.
lm_base_h=lm(house_sales~trnd+mon,house_sales_merged_data)
summary(lm_base_h)
## 
## Call:
## lm(formula = house_sales ~ trnd + mon, data = house_sales_merged_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -57448 -14915  -2443  14194 112588 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 90909.29    9157.71   9.927  < 2e-16 ***
## trnd          122.66      68.99   1.778   0.0783 .  
## monAug      24097.67   11652.52   2.068   0.0410 *  
## monDec      58084.53   11662.32   4.981 2.45e-06 ***
## monFeb      -4499.68   11650.07  -0.386   0.7001    
## monJan      -8263.22   11651.09  -0.709   0.4797    
## monJul      16169.32   11651.09   1.388   0.1681    
## monJun      16432.68   11650.07   1.411   0.1613    
## monMar      14627.46   11649.46   1.256   0.2120    
## monMay        589.94   11649.46   0.051   0.9597    
## monNov      20697.19   11659.26   1.775   0.0787 .  
## monOct      20804.65   11656.60   1.785   0.0771 .  
## monSep      24529.21   11654.36   2.105   0.0377 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 26050 on 107 degrees of freedom
## Multiple R-squared:  0.3445, Adjusted R-squared:  0.271 
## F-statistic: 4.686 on 12 and 107 DF,  p-value: 4.29e-06
# Residual diagnostics: plots plus a Breusch-Godfrey test for serial correlation.
checkresiduals(lm_base_h)

## 
##  Breusch-Godfrey test for serial correlation of order up to 16
## 
## data:  Residuals
## LM test = 32.374, df = 16, p-value = 0.008935
# Store fitted values and residuals, then overlay actual vs predicted.
tmp_h[,predicted_trend_mon:=predict(lm_base_h,tmp_h)]
tmp_h[,residual_trend_mon:=actual-predicted_trend_mon]
ggplot(tmp_h ,aes(x=Date)) +
        geom_line(aes(y=actual,color='real')) + 
        geom_line(aes(y=predicted_trend_mon,color='predicted'))

# Residual autocorrelation up to lag 60 (five years).
acf(tmp_h$residual_trend_mon,60)

Although we removed yearly seasonality from the data, the R-squared value shows that the model performs poorly. Let’s examine the relation between the residuals and the independent variables.

# Scatter the baseline-model residuals against each candidate regressor to
# see which variables still carry explanatory signal.
panel_vars_h <- c("bist_idx", "interest_rate", "c_price_idx", "unit_price",
                  "tapu_search_count")
panel_plots_h <- lapply(panel_vars_h, function(v) {
  ggplot(tmp_h, aes(x = .data[[v]], y = residual_trend_mon)) +
    geom_point() +
    labs(x = v)
})
gridExtra::grid.arrange(grobs = panel_plots_h, nrow = 2)

To avoid multicolinearity, let’s implement interest rates (since there seems a negative correlation) information to model.

# Extend the baseline model with the interest rate.
lm_base_h2=lm(house_sales~trnd+mon+interest_rate,house_sales_merged_data)
summary(lm_base_h2)
## 
## Call:
## lm(formula = house_sales ~ trnd + mon + interest_rate, data = house_sales_merged_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -71233 -11766    421  11260  92357 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   104225.9     8889.8  11.724  < 2e-16 ***
## trnd             381.7       84.8   4.501 1.74e-05 ***
## monAug         24696.7    10693.3   2.310   0.0229 *  
## monDec         62353.0    10741.8   5.805 6.79e-08 ***
## monFeb         -5093.1    10691.0  -0.476   0.6348    
## monJan         -8700.3    10691.6  -0.814   0.4176    
## monJul         17856.0    10697.5   1.669   0.0980 .  
## monJun         18637.0    10701.0   1.742   0.0845 .  
## monMar         14066.6    10690.4   1.316   0.1911    
## monMay          1101.9    10690.3   0.103   0.9181    
## monNov         23674.8    10718.3   2.209   0.0293 *  
## monOct         23719.4    10715.1   2.214   0.0290 *  
## monSep         26801.1    10705.6   2.503   0.0138 *  
## interest_rate  -2193.0      477.7  -4.591 1.22e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23900 on 106 degrees of freedom
## Multiple R-squared:  0.4532, Adjusted R-squared:  0.3862 
## F-statistic: 6.759 on 13 and 106 DF,  p-value: 2.721e-09
# Residual diagnostics for the interest-rate-augmented model.
checkresiduals(lm_base_h2)

## 
##  Breusch-Godfrey test for serial correlation of order up to 17
## 
## data:  Residuals
## LM test = 28.947, df = 17, p-value = 0.03502
# Fitted values and residuals for the second model; overlay actual vs predicted.
tmp_h[,predicted_trend_mon_int:=predict(lm_base_h2,tmp_h)]
tmp_h[,residual_trend_mon_int:=actual-predicted_trend_mon_int]
ggplot(tmp_h ,aes(x=Date)) +
        geom_line(aes(y=actual,color='real')) + 
        geom_line(aes(y=predicted_trend_mon_int,color='predicted'))

# Scatter the second model's residuals against each candidate regressor to
# check which variables still carry explanatory signal.
panel_vars_h2 <- c("bist_idx", "interest_rate", "c_price_idx", "unit_price",
                   "tapu_search_count")
panel_plots_h2 <- lapply(panel_vars_h2, function(v) {
  ggplot(tmp_h, aes(x = .data[[v]], y = residual_trend_mon_int)) +
    geom_point() +
    labs(x = v)
})
gridExtra::grid.arrange(grobs = panel_plots_h2, nrow = 2)

Let’s implement the search count of “Tapu” word to somehow integrate fluctuations into the model.

# Add the "Tapu" search count on top of trend, seasonality, and interest rate.
lm_base_h3=lm(house_sales~trnd+mon+interest_rate+tapu_search_count,house_sales_merged_data)
summary(lm_base_h3)
## 
## Call:
## lm(formula = house_sales ~ trnd + mon + interest_rate + tapu_search_count, 
##     data = house_sales_merged_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -55560 -11183   -850  11757  47123 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        84505.5     7787.2  10.852  < 2e-16 ***
## trnd                -471.9      136.9  -3.447 0.000817 ***
## monAug             12092.3     8946.2   1.352 0.179387    
## monDec             50940.6     8954.8   5.689 1.17e-07 ***
## monFeb             -8026.0     8782.7  -0.914 0.362889    
## monJan            -10150.1     8776.1  -1.157 0.250076    
## monJul              3504.2     8999.7   0.389 0.697795    
## monJun              5564.8     8965.3   0.621 0.536134    
## monMar              7550.0     8818.8   0.856 0.393877    
## monMay             -1897.1     8782.5  -0.216 0.829402    
## monNov             14275.6     8891.0   1.606 0.111361    
## monOct             15499.5     8866.1   1.748 0.083354 .  
## monSep             16099.0     8908.8   1.807 0.073613 .  
## interest_rate       -958.1      427.5  -2.241 0.027123 *  
## tapu_search_count   1424.3      196.8   7.239 7.77e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19610 on 105 degrees of freedom
## Multiple R-squared:  0.6353, Adjusted R-squared:  0.5866 
## F-statistic: 13.06 on 14 and 105 DF,  p-value: < 2.2e-16
# Residual diagnostics for the full model.
checkresiduals(lm_base_h3)

## 
##  Breusch-Godfrey test for serial correlation of order up to 18
## 
## data:  Residuals
## LM test = 51.414, df = 18, p-value = 4.601e-05
# Fitted values and residuals for the full model; overlay actual vs predicted.
tmp_h[,predicted_trend_mon_int_count:=predict(lm_base_h3,tmp_h)]
tmp_h[,residual_trend_mon_int_count:=actual-predicted_trend_mon_int_count]
ggplot(tmp_h ,aes(x=Date)) +
        geom_line(aes(y=actual,color='real')) + 
        geom_line(aes(y=predicted_trend_mon_int_count,color='predicted'))

# Residual autocorrelation up to lag 60.
acf(tmp_h$residual_trend_mon_int_count,60)

Conclusion

We started the time series decomposition by implementing trend and yearly seasonality variables after we examined the autocorrelation of the target variable. Although we saw that the residuals approached a white noise model, the metrics and the residual-vs-independent-variable scatter plots confirmed that there could be improvement. To avoid multicollinearity in the model we added only the interest rate to the linear model. After that, to capture fluctuations, the search count of the word “Tapu” was also added to the model. As can be seen in both the R-squared metric and the actual-vs-predicted plot, the model showed great improvement.

Term Deposit Amount (1 Month)

Motivation

Large number of investors consider term deposit (vadeli mevduat in Turkish) as a safe investment. Although it attracts many people, overall term deposit amount in the banks fluctuates especially on the countries with fluctuating interest rates and inflation. In this time series analysis, term deposit amount in Turkish Liras will be analyzed with the independent variables below:

  • Exchange Rates: While one option is investing in a term deposit, investing in foreign currency is another option that directly affects the amount of deposits in the banks. Increasing exchange rates could lead people to invest in currency instead of term deposits.

  • Interest Rate: High interest rates are the main reason people want to invest their money in bank deposits. High interest rates (especially higher than inflation or increase in exchange rate) would lead to higher amount of money in bank term deposits.

  • Consumer Price Index: Inflation rate and CPI are the main reasons people want to invest their money and protect their buying power. High inflation discourages people to invest in term deposits since the real value of the money in the deposit may decrease as time goes.

  • House Price Index: Although it does not necessarily affect the amount of money in bank term deposits, the house price index can be seen as an alternative investment option depending on the level of interest rates.

Importing Data

# Load the EVDS macro series and coerce straight to data.table.
term_deposit_all_data <- data.table::as.data.table(
  read_excel("~/Downloads/EVDS-12.xlsx")
)

# Short snake_case names, one per series.
data.table::setnames(
  term_deposit_all_data,
  c("Date", "ex_rate", "interest_rate", "deposit_amount",
    "consumer_price_idx", "house_price_idx")
)

# Normalize the timestamp column to plain Date class.
term_deposit_all_data[, Date := as.Date(Date)]

# Sanity check: first six monthly observations (series starts Jan 2014).
head(term_deposit_all_data)
##          Date  ex_rate interest_rate deposit_amount consumer_price_idx
##        <Date>    <num>         <num>          <num>              <num>
## 1: 2014-01-01 2.216836        7.3140       71157129             233.54
## 2: 2014-02-01 2.212760        9.1850       69734191             234.54
## 3: 2014-03-01 2.217795        9.7125       64678809             237.18
## 4: 2014-04-01 2.127471        9.6100       66501577             240.37
## 5: 2014-05-01 2.090805        9.2820       70765088             241.32
## 6: 2014-06-01 2.115724        8.9700       73326895             242.07
##    house_price_idx
##              <num>
## 1:            64.1
## 2:            64.8
## 3:            65.7
## 4:            66.8
## 5:            67.5
## 6:            68.2
# Structure check: 120 monthly rows, one Date column plus five numeric series.
str(term_deposit_all_data)
## Classes 'data.table' and 'data.frame':   120 obs. of  6 variables:
##  $ Date              : Date, format: "2014-01-01" "2014-02-01" ...
##  $ ex_rate           : num  2.22 2.21 2.22 2.13 2.09 ...
##  $ interest_rate     : num  7.31 9.19 9.71 9.61 9.28 ...
##  $ deposit_amount    : num  71157129 69734191 64678809 66501577 70765088 ...
##  $ consumer_price_idx: num  234 235 237 240 241 ...
##  $ house_price_idx   : num  64.1 64.8 65.7 66.8 67.5 68.2 69.3 70.2 70.9 71.5 ...
##  - attr(*, ".internal.selfref")=<externalptr>
# Google Trends search counts for the keyword "vadeli mevduat".
term_deposit_search_data <- data.table::as.data.table(
  read_excel("~/Downloads/multiTimeline-14.xlsx")
)
data.table::setnames(term_deposit_search_data, c("Date", "vadeli_mevduat_count"))
term_deposit_search_data[, Date := as.Date(Date)]

# Full outer join on Date keeps months present in either source.
term_deposit_merged_data <- merge(term_deposit_all_data, term_deposit_search_data,
                                  by = "Date", all = TRUE)

Plot the Target Variable

# --- Locate turning points in the deposit series ------------------------------
# A sign change of the first difference marks a local extremum:
# -2 = peak (up then down), +2 = trough (down then up).
# Compute the double diff ONCE and reuse it (the original repeated the pass).
slope_change_t <- diff(sign(diff(term_deposit_merged_data$deposit_amount)))
peak_indices_t <- which(slope_change_t == -2) + 1
bottom_indices_t <- which(slope_change_t == +2) + 1

# Month-over-month percentage move into each extremum.
# pmax(idx - 1, 1) guards against indexing before the first observation.
pct_change_at_t <- function(idx) {
  vals <- term_deposit_merged_data$deposit_amount
  100 * abs(vals[idx] / vals[pmax(idx - 1, 1)] - 1)
}
peak_percentage_changes_t <- pct_change_at_t(peak_indices_t)
bottom_percentage_changes_t <- pct_change_at_t(bottom_indices_t)

# Keep only moves larger than the significance threshold (percent).
significance_threshold_pct <- 6  # adjust as needed
significant_peak_indices_t <- peak_indices_t[peak_percentage_changes_t > significance_threshold_pct]
significant_bottom_indices_t <- bottom_indices_t[bottom_percentage_changes_t > significance_threshold_pct]

# Combine indices for significant peaks and troughs
combined_indices_t <- union(significant_peak_indices_t, significant_bottom_indices_t)

# Extract data for significant peaks and troughs
extreme_data_t <- term_deposit_merged_data[combined_indices_t, ]

# Log-scaled series with the significant extrema highlighted and date-labelled.
ggplot(term_deposit_merged_data, aes(x = Date, y = deposit_amount)) +
  geom_line() +
  scale_y_log10() +  # Apply logarithmic scale
  geom_point(data = extreme_data_t, aes(x = Date, y = deposit_amount),
             color = "purple", size = 2) +
  geom_text(data = extreme_data_t,
            aes(x = Date, y = deposit_amount, label = format(Date, "%b %Y")),
            vjust = -1, color = "purple", size = 2) +
  scale_x_date(date_labels = "%b %Y", date_breaks = "6 months") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Term deposits show a clear uptrend through time. Although many factors could be driving this trend, the main ones are likely the exchange rates and inflation.

Plot the Target Variable vs Search Keyword

# Both series standardised via scale() so they can share one axis.
ggplot(term_deposit_merged_data, aes(x = Date, y = deposit_amount)) +
  geom_line(aes(y = scale(deposit_amount), color = "Term Deposit Amount")) +
  geom_line(aes(y = scale(vadeli_mevduat_count), color = "Search Count of Vadeli Mevduat")) +
  labs(title = "Term Deposit Amount vs. Search Count of Vadeli Mevduat",
       x = "Date", y = "Count/Deposit Amount") +
  scale_x_date(date_labels = "%b %Y", date_breaks = "6 months") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = c(0.2, 0.8))

“Vadeli mevduat” is the Turkish term for a term deposit. Although it is not obvious at the beginning of the plot due to scaling, there is a correlation between the search count and the amount of term deposits. After 2018 both of them show an upward trend. The search count showed some peaks during certain intervals, which could be due to sudden government policy changes in the economy.

Correlations

ggpairs(term_deposit_merged_data)

In this plot, the deposit amount shows strong correlation with many independent variables. This would be ideal if the correlations among the independent variables were not also high (multicollinearity). This should be taken care of during modeling.

Time Series Analysis

For this target variable we see an obvious upward trend throughout the years. Let’s see the autocorrelation plot of the target variable.

# Autocorrelation of the raw target up to lag 36 (three years of monthly data).
acf(term_deposit_merged_data$deposit_amount,36)

Seasonality does not show up in the autocorrelation.

# trnd is a simple 1..N time index used as the linear-trend regressor.
# seq_len(.N) is the safe form: 1:.N would yield c(1, 0) on an empty table.
term_deposit_merged_data[,trnd:=seq_len(.N)]
head(term_deposit_merged_data)
## Key: <Date>
##          Date  ex_rate interest_rate deposit_amount consumer_price_idx
##        <Date>    <num>         <num>          <num>              <num>
## 1: 2014-01-01 2.216836        7.3140       71157129             233.54
## 2: 2014-02-01 2.212760        9.1850       69734191             234.54
## 3: 2014-03-01 2.217795        9.7125       64678809             237.18
## 4: 2014-04-01 2.127471        9.6100       66501577             240.37
## 5: 2014-05-01 2.090805        9.2820       70765088             241.32
## 6: 2014-06-01 2.115724        8.9700       73326895             242.07
##    house_price_idx vadeli_mevduat_count  trnd
##              <num>                <num> <int>
## 1:            64.1                   14     1
## 2:            64.8                   14     2
## 3:            65.7                    8     3
## 4:            66.8                    7     4
## 5:            67.5                    7     5
## 6:            68.2                   10     6
# Work on a copy so the raw merged table stays untouched; `actual` aliases the
# target for the later actual-vs-predicted plots.
tmp_t=copy(term_deposit_merged_data)
tmp_t[,actual:=deposit_amount]
# Baseline model: deposit amount explained by the linear trend only.
lm_base_t=lm(deposit_amount~trnd,term_deposit_merged_data)
summary(lm_base_t)
## 
## Call:
## lm(formula = deposit_amount ~ trnd, data = term_deposit_merged_data)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -205698500 -139043568  -55135080  110781709  534016332 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -128276253   31237898  -4.106 7.44e-05 ***
## trnd           6959952     448081  15.533  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.7e+08 on 118 degrees of freedom
## Multiple R-squared:  0.6716, Adjusted R-squared:  0.6688 
## F-statistic: 241.3 on 1 and 118 DF,  p-value: < 2.2e-16
# Residual diagnostics for the trend-only baseline (Breusch-Godfrey test below).
checkresiduals(lm_base_t)

## 
##  Breusch-Godfrey test for serial correlation of order up to 10
## 
## data:  Residuals
## LM test = 114.86, df = 10, p-value < 2.2e-16
# Fitted values and residuals of the trend-only baseline.
tmp_t[, predicted_trend := predict(lm_base_t, newdata = tmp_t)]
tmp_t[, residual_trend := actual - predicted_trend]

Residuals show high autocorrelation. Although this can be handled with differencing, let’s first examine the independent variables.

# Baseline residuals against each candidate regressor; curvature in these
# panels motivates trying squared terms.
res_vs_ex  <- ggplot(tmp_t, aes(x = ex_rate, y = residual_trend)) + geom_point()
res_vs_int <- ggplot(tmp_t, aes(x = interest_rate, y = residual_trend)) + geom_point()
res_vs_cpi <- ggplot(tmp_t, aes(x = consumer_price_idx, y = residual_trend)) + geom_point()
res_vs_hpi <- ggplot(tmp_t, aes(x = house_price_idx, y = residual_trend)) + geom_point()
res_vs_cnt <- ggplot(tmp_t, aes(x = vadeli_mevduat_count, y = residual_trend)) + geom_point()
gridExtra::grid.arrange(res_vs_ex, res_vs_int, res_vs_cpi, res_vs_hpi, res_vs_cnt, nrow = 2)

To increase the accuracy of the model, squared terms of the independent variables are added to the model.

# Squared terms for the two regressors whose residual scatter looked nonlinear;
# added to both the modeling table and the working copy used for prediction.
term_deposit_merged_data[,ex_rate_sq:=ex_rate^2]
term_deposit_merged_data[,consumer_price_idx_sq:=consumer_price_idx^2]
tmp_t[,ex_rate_sq:=ex_rate^2]
tmp_t[,consumer_price_idx_sq:=consumer_price_idx^2]
# NOTE(review): `~ .` regresses on every remaining column, so Date AND trnd
# both enter the model even though they are near-perfectly collinear (see the
# huge standard errors on both in the summary) — confirm this is intended.
lm_base_t1 = lm(deposit_amount~.,data=term_deposit_merged_data)
summary(lm_base_t1)
## 
## Call:
## lm(formula = deposit_amount ~ ., data = term_deposit_merged_data)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -123764228  -22379406      41345   19816644  146213075 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            1.746e+09  8.430e+10   0.021  0.98351    
## Date                  -1.064e+05  5.255e+06  -0.020  0.98389    
## ex_rate                2.684e+07  1.753e+07   1.531  0.12855    
## interest_rate         -6.125e+06  1.346e+06  -4.552 1.38e-05 ***
## consumer_price_idx    -4.275e+05  3.935e+05  -1.086  0.27969    
## house_price_idx        1.241e+06  1.456e+05   8.524 9.12e-14 ***
## vadeli_mevduat_count   7.452e+05  5.451e+05   1.367  0.17437    
## trnd                   4.840e+06  1.600e+08   0.030  0.97592    
## ex_rate_sq            -1.273e+06  4.622e+05  -2.753  0.00691 ** 
## consumer_price_idx_sq  2.020e+02  1.382e+02   1.461  0.14686    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 38580000 on 110 degrees of freedom
## Multiple R-squared:  0.9842, Adjusted R-squared:  0.9829 
## F-statistic:   763 on 9 and 110 DF,  p-value: < 2.2e-16
# Residual diagnostics for the squared-terms model (Breusch-Godfrey test below).
checkresiduals(lm_base_t1)

## 
##  Breusch-Godfrey test for serial correlation of order up to 13
## 
## data:  Residuals
## LM test = 68.831, df = 13, p-value = 1.317e-09
# Fitted values and residuals of the squared-terms model.
tmp_t[, predicted_trend_sq := predict(lm_base_t1, newdata = tmp_t)]
tmp_t[, residual_trend_sq := actual - predicted_trend_sq]

# Residuals of the richer model against each regressor.
res2_vs_ex  <- ggplot(tmp_t, aes(x = ex_rate, y = residual_trend_sq)) + geom_point()
res2_vs_int <- ggplot(tmp_t, aes(x = interest_rate, y = residual_trend_sq)) + geom_point()
res2_vs_cpi <- ggplot(tmp_t, aes(x = consumer_price_idx, y = residual_trend_sq)) + geom_point()
res2_vs_hpi <- ggplot(tmp_t, aes(x = house_price_idx, y = residual_trend_sq)) + geom_point()
res2_vs_cnt <- ggplot(tmp_t, aes(x = vadeli_mevduat_count, y = residual_trend_sq)) + geom_point()
gridExtra::grid.arrange(res2_vs_ex, res2_vs_int, res2_vs_cpi, res2_vs_hpi, res2_vs_cnt, nrow = 2)

# Actual vs fitted series over time.
ggplot(tmp_t, aes(x = Date)) +
  geom_line(aes(y = actual, color = 'real')) +
  geom_line(aes(y = predicted_trend_sq, color = 'predicted'))

Conclusion

The term deposit amount in Turkey has shown an upward trend over the years. There was no evidence indicating any seasonality, so only a trend variable was added to the model. There was high serial correlation in the residuals, and the residuals’ relation to the independent variables looked closer to quadratic. Squared terms of the independent variables were therefore implemented, but linear terms showed better performance in terms of serial correlation of the residuals.

Correlation Between Target Variables

# One table holding the three target series side by side (aligned by row
# order — all three sources cover the same 120 months).
all_target_data <- data.table(
  c_account      = c_account_all_data$c_account,
  house_sales    = house_sales_all_data$house_sales,
  deposit_amount = term_deposit_all_data$deposit_amount
)

print(all_target_data)
##      c_account house_sales deposit_amount
##          <num>       <num>          <num>
##   1:     -4519       87639       71157129
##   2:     -2703       82597       69734191
##   3:     -3264       87617       64678809
##   4:     -4201       83610       66501577
##   5:     -2709       90377       70765088
##  ---                                     
## 116:      -376      122091      991320641
## 117:      1996      102656     1001188177
## 118:       122       93761     1125552602
## 119:     -2796       93514     1044564495
## 120:     -2126      138577     1240934320
# CHATGPT PROMPT: I need to create a data table taking one column from each data table and merge them.
# Pairwise correlations between the three targets, rendered as a heat map
# (upper triangle, hierarchically clustered, coefficients printed in black).
cor_matrix <- cor(all_target_data)

corrplot(cor_matrix,
         method = "color", type = "upper", order = "hclust",
         addCoef.col = "black",
         tl.col = "black", tl.srt = 45)